import os
import pandas as pd

# Path of the xlsx file to deduplicate (edit to point at the target file).
file_path = './第一期result.xlsx'

# Column used as the deduplication key.
CREDIT_CODE_COLUMN = 'unifiedSocialcreditCode'
# Literal value in the key column that is exempt from deduplication.
MISSING_CODE_PLACEHOLDER = '-'


def deduplicate_by_credit_code(
    df,
    column=CREDIT_CODE_COLUMN,
    placeholder=MISSING_CODE_PLACEHOLDER,
):
    """Return a copy of *df* deduplicated on *column*.

    Rows whose *column* value equals *placeholder*, or is missing (NaN/None),
    carry no usable key and are therefore all kept. Rows with a real value are
    reduced to the first occurrence of each value.

    Args:
        df: DataFrame that contains *column*.
        column: Name of the key column.
        placeholder: Value in *column* that marks a row as having no key.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: the deduplicated keyed rows
        first, followed by every un-keyed row, each group in input order.

    Raises:
        KeyError: If *column* is not present in *df*.
    """
    codes = df[column]
    # Missing cells must be excluded explicitly: drop_duplicates() considers
    # NaN keys equal to each other and would collapse them into a single row.
    has_code = codes.notna() & (codes != placeholder)

    unique_keyed_rows = df[has_code].drop_duplicates(subset=column)
    unkeyed_rows = df[~has_code]
    return pd.concat([unique_keyed_rows, unkeyed_rows], ignore_index=True)


def main():
    """Read ``file_path``, deduplicate it and save the result as a new xlsx.

    The output is written to the current working directory under the input's
    base name prefixed with ``去重结果_``. Exits with status 1 if the input
    file cannot be read.
    """
    try:
        df = pd.read_excel(file_path)
    except Exception as e:
        print(f"读取文件 {file_path} 时出错: {e}")
        # SystemExit is a builtin; the bare exit() helper is injected by the
        # `site` module and is not guaranteed to exist in every runtime.
        raise SystemExit(1) from e

    if CREDIT_CODE_COLUMN in df.columns:
        dedup_df = deduplicate_by_credit_code(df)
        print(f"使用'unifiedSocialcreditCode'列去重（排除'-'值）后，剩余 {len(dedup_df)} 条记录")
    else:
        # Without the key column, fall back to dropping fully identical rows.
        print("警告: 数据中不包含'unifiedSocialcreditCode'列，将使用全量数据去重")
        dedup_df = df.drop_duplicates()

    # Save to a new file named after the input.
    base_name = os.path.basename(file_path)
    new_file = f"去重结果_{base_name}"
    dedup_df.to_excel(new_file, index=False)
    print(f"去重完成，结果已保存为 '{new_file}'")


if __name__ == "__main__":
    main()